Library setup

library(DESeq2)
library(data.table)
library(gdata)
library(rtracklayer)
library(BSgenome)
library(VennDiagram)
library(ggplot2)
library(biomaRt)
library(RColorBrewer)
library(readr)
library(ensembldb)
library(EnsDb.Mmusculus.v79)
library(org.Mm.eg.db)
library(AnnotationDbi)
library(tidyverse)
library(plotly)

CDF analysis using RNA-Seq data

Set up the function

plotCDF.ggplot <- function(gene.counts, gene.sets, gene.set.labels,
                           col = "", linetype = "", xlim = c( -1.0, 1.3 ),
                           legend.size = 22, axistitle.size = 22, title = "Fold change log2 (Dicer KO/WT)",
                           legend.pos = c(0.7, 0.18)) {
  require(ggplot2)
  df.log2fc <- gene.counts[,c("gene", "log2FC")]
  #rownames(df.log2fc) <- df.log2fc$gene
  if (length(gene.sets) != length(gene.set.labels)){
    return("Length of gene sets doesn't match labels")
  }
  target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
  for (i in 2:length(gene.sets)){
    target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
  }

  gene.set.counts <- c()
  for (j in 1:length(gene.sets)){
    gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
  }
  
  target.expr$Category <- rep(gene.set.labels, gene.set.counts)
  target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
  
  log2FC.values <- lapply(gene.sets, function(gene.set) {
    gene.counts[gene.counts$gene %in% gene.set,]$log2FC
  })

  ks.pvals <- lapply(log2FC.values,
                     function(log2FCs) {
                       ks.test(log2FCs, log2FC.values[[1]])$p.value
                     })
  
  p <- ggplot( target.expr, aes( x = log2FC, colour = Category ) ) +
  stat_ecdf( geom = 'step', aes( colour = Category, linetype = Category ), lwd = 1 ) +
  scale_color_manual( values = col, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
  scale_linetype_manual( values = linetype, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
  # xlim() will remove data points; Be careful in the future
  coord_cartesian( xlim = xlim ) + xlab(title) + ylab('CDF') +
  theme_classic() + theme( legend.background = element_rect(fill = NA), 
                           legend.title = element_blank(), 
                           legend.position = legend.pos,
                           legend.text = element_text(size=legend.size),
                           legend.key.size = unit(1.5, 'lines'),
                           axis.title.x = element_text(size=axistitle.size, margin = margin(t = 10)),
                           axis.title.y = element_text(size=axistitle.size, margin = margin(r = 10)),
                           axis.text=element_text(size=20),
                           axis.line = element_line(size = 1),  #axis label size
                           axis.ticks.length = unit(0.3, "cm")) #increase tick length
  
  for (k in 2:length(gene.sets)){
    p <- p + annotate(geom = "text", x = -0.9, y = 1-0.08*(k-1), hjust = 0, 
                      label = sprintf("p = %.0e", ks.pvals[k]), 
                      colour = col[k], size = 8)
  }
  print(p)
}

Load the enema model RNA-Seq DGE dataset.

rna_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/Differential Analysis.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   baseMean = col_double(),
##   log2FoldChange = col_double(),
##   lfcSE = col_double(),
##   stat = col_double(),
##   pvalue = col_double(),
##   padj = col_double()
## )

Now load the list of peaks associated with each miRNA.

peaks.mir <- readRDS("Datafiles/peaks-mirs-200-12232019.rds")

# First we need to nnotate each peak with its potential target gene, 3'UTR annotation gets priority.
peaks.mir$'target_gene' <- NA
for (i in 1:length(peaks.mir)) {
  if (!is.na(peaks.mir$utr3[i]) | !is.na(peaks.mir$`utr3*`[i])) {
    gene_name <- unique(c(peaks.mir$utr3[i],peaks.mir$`utr3*`[i]))
    gene_name <- gene_name[!is.na(gene_name)] 
    peaks.mir$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
  }
  else {
    gene_name <- unique(c(peaks.mir$exon[i], peaks.mir$intron[i],peaks.mir$utr5[i],peaks.mir$`utr5*`[i]))
    gene_name <- gene_name[!is.na(gene_name)]
    if (length(gene_name) >0) {
    peaks.mir$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
    }
  }
}

Now I need to annotate the target list with both Ensembl ID and Uniprot ID and Enterez ID (for KEGG and GSEA).

# annotate with Ensembl ID
# Ensembl IDs are annotated using `EnsDb.Mmusculus.v79` package since that is the one that I used for RNA-Seq analysis
target_gene_list <- peaks.mir$target_gene[!is.na(peaks.mir$target_gene)]
annotations_ensembl <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = as.character(target_gene_list),
                                           columns = c("GENEID"),
                                           keytype = "GENENAME")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_ensembl$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_ensembl <- annotations_ensembl[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_ensembl$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# annotate the dataset with Ensembl ID
peaks.mir$target_Ensembl_ID <- NA
peaks.mir <- peaks.mir[!is.na(peaks.mir$target_gene)]
for (i in 1:length(peaks.mir)) {
  index <- grep(peaks.mir$target_gene[i], annotations_ensembl$GENENAME)
  if (length(index)>0) {
    peaks.mir$target_Ensembl_ID[i] <- paste(annotations_ensembl$GENEID[index], collapse = " ")
  }
}

# annotate the dataset with Uniprot ID
## first I need to generate a list of genes to retrieve Uniprot ID from the website
peak.targets <- unique(peaks.mir$target_gene)
write.csv(peak.targets, "all_peaks.csv")

uniprot_ID <- read_csv("all_peaks_Uniprot.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   gene_name = col_character(),
##   Entry = col_character(),
##   `Entry name` = col_character(),
##   Status = col_character(),
##   `Protein names` = col_character(),
##   `Gene names` = col_character(),
##   Organism = col_character(),
##   Length = col_double()
## )
peaks.mir$target_Uniprot_ID <- NA
for (i in 1:length(peaks.mir)) {
  index <- grep(peaks.mir$target_gene[i], uniprot_ID$gene_name)
  if (length(index)>0) {
    peaks.mir$target_Uniprot_ID[i] <- paste(uniprot_ID$Entry[index], collapse = " ")
  }
}

## filter out targets with no ID
no_id <- setdiff(peak.targets, uniprot_ID$gene_name)
write.csv(no_id, "no_UniprotID.csv")

## load the manually annotated ID list
no.id.2.id <- read_csv("no_UniprotID_2_ID.csv", col_names = FALSE)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_character()
## )
for (i in 1:dim(no.id.2.id)[1]) {
  index <- grep(no.id.2.id$X1[i], peaks.mir$target_gene)
  if (length(index)>0) {
    peaks.mir$target_Uniprot_ID[index] <- paste(no.id.2.id$X2[i], collapse = " ")
  }
}

# number of genes with UniprotID
length(unique(peaks.mir$target_Uniprot_ID[!is.na(peaks.mir$target_Uniprot_ID)]))
## [1] 1732
# annotate with Entrez ID
# Entrez IDs are annotated using `org.Mm.eg.db` package since this is the most updated
annotations_entrez <- AnnotationDbi::select(org.Mm.eg.db,
                                           keys = as.character(target_gene_list),
                                           columns = c("ENTREZID"),
                                           keytype = "SYMBOL")
## 'select()' returned many:1 mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_entrez$ENTREZID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_entrez <- annotations_entrez[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_entrez$ENTREZID) %>%
  which() %>%
  length()
## [1] 1
# annotate the dataset with Entrez ID
peaks.mir$target_Entrez_ID <- NA
peaks.mir <- peaks.mir[!is.na(peaks.mir$target_gene)]
for (i in 1:length(peaks.mir)) {
  index <- grep(peaks.mir$target_gene[i], annotations_entrez$SYMBOL)
  if (length(index)>0) {
    peaks.mir$target_Entrez_ID[i] <- paste(annotations_entrez$ENTREZID[index], collapse = " ")
  }
}

# save the datafile
saveRDS(peaks.mir, "Datafiles/peaks-mirs-200-12232019-withID.rds")

Now we group peaks by miRNA.

targetofmiR <- function(peaks.mir = brain.peaks.mirs,
                        miRNA = "",
                        sitetype = "8mer"){
  peaks.mir.sub <- as.data.frame(peaks.mir[,c("log2FC", "padj", 
                                              "seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")])
  peaks.seedmatch <- lapply(c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer"),
                            function(seed){
                              map <- peaks.mir.sub[grepl(miRNA, peaks.mir.sub[,seed]),]
                              map <- rownames(map)
                              map
                            })
  names(peaks.seedmatch) <- c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")

  if (sitetype == "8mer"){
    maps <- peaks.seedmatch[[1]]
  } else if (sitetype == "7mer_above"){
    maps <- unique(unlist(peaks.seedmatch[1:3]))
  } else if (sitetype == "7mer"){
    maps <- unique(unlist(peaks.seedmatch[2:3]))
  } else if (sitetype == "6mer"){
    maps <- peaks.seedmatch[[4]]
  } else {
    print("Please input site type as: 8mer, 7mer_above, 7mer or 6mer")
  }
    return(peaks.mir[maps])
}
mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-12232019.rds")
mirs <- subset(mirna.family.DGE, baseMean > 200)
mirs <- mirs$miR.family

mirs.peaks <- lapply(mirs,
                     function(mir){
                       targetofmiR(miRNA = mir,
                                   peaks.mir = peaks.mir,
                                   sitetype = "7mer_above")
                     })
names(mirs.peaks) <- mirs
saveRDS(mirs.peaks, "Datafiles/miRNA-peaks-list-12232019-withIDs.rds")

Now we can draw the CDF plots for specific miRNAs. I will do the top 10 that have the most peaks associated.

colnames(rna_DGE)[1] <- "gene"
colnames(rna_DGE)[3] <- "log2FC"
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(rna_DGE,
               list(rna_DGE$gene, peaks.mir$target_Ensembl_ID),
               c("All genes", "miRNAs over 200"),
               col = c("grey15", cols[1]),
               linetype = c(1, 1),
               title = "RNA_LFC(K-Ras_G12D/WT)"
               )
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)

for (i in 1:10) {
  plotCDF.ggplot(rna_DGE,
               list(rna_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
               c("All genes", mirna[i]),
               col = c("grey15", cols[i]),
               linetype = c(1, 1),
               title = "RNA_LFC(K-Ras_G12D/WT)"
               )
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

Boxplot

# define the function
plotBoxplot.ggplot <- function(gene.counts, gene.sets, gene.set.labels) {
  require(ggplot2)
  df.log2fc <- gene.counts[,c("gene", "log2FC")]
  #rownames(df.log2fc) <- df.log2fc$gene
  if (length(gene.sets) != length(gene.set.labels)){
    return("Length of gene sets doesn't match labels")
  }
  target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
  for (i in 2:length(gene.sets)){
    target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
  }

  gene.set.counts <- c()
  for (j in 1:length(gene.sets)){
    gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
  }
  
  target.expr$Category <- rep(gene.set.labels, gene.set.counts)
  target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
  
  p <- ggplot(target.expr, aes(x=Category, y=log2FC, fill = Category)) + geom_boxplot() + xlab("") + 
  ylab("log2FC(KRas-G12D/WT)") + scale_fill_manual(values = cols[1:length(gene.sets)]) + theme(axis.text.x =element_text(angle = 90, hjust = 1))
  
  print(p)
}
#get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
  gene.list <- c(gene.list, list(mirs.peaks[[i]]$target_Ensembl_ID))
}

plotBoxplot.ggplot(rna_DGE,
               c(list(rna_DGE$gene), gene.list),
               c("All genes", mirna[1:10])
               ) + ylim(-2,2)

## Warning: Removed 122 rows containing non-finite values (stat_boxplot).

CAD analysis using proteomics data

Load the proteomic dataset

protein_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/ceMS_diff.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_double(),
##   `Protein Id` = col_character(),
##   X2 = col_character(),
##   `Gene Symbol` = col_character(),
##   p_values = col_double(),
##   q_values = col_double(),
##   foldChange = col_double(),
##   LFC = col_double()
## )
colnames(protein_DGE)[2] <- "gene"
colnames(protein_DGE)[8] <- "log2FC"
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(protein_DGE,
               list(protein_DGE$gene, peaks.mir$target_Uniprot_ID),
               c("All genes", "miRNAs over 200"),
               col = c("grey15", cols[1]),
               linetype = c(1, 1),
               title = "Protein_LFC(K-Ras_G12D/WT)"
               )
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

for (i in 1:10) {
  plotCDF.ggplot(protein_DGE,
               list(protein_DGE$gene, mirs.peaks[[i]]$target_Uniprot_ID),
               c("All genes", mirna[i]),
               col = c("grey15", cols[i]),
               linetype = c(1, 1),
               title = "Protein_LFC(K-Ras_G12D/WT)"
               )
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

Boxplots

# get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
  gene.list <- c(gene.list, list(mirs.peaks[[i]]$target_Uniprot_ID))
}

plotBoxplot.ggplot(protein_DGE,
               c(list(protein_DGE$gene), gene.list),
               c("All genes", mirna[1:10])
               ) + ylim(-1,1)

## Warning: Removed 359 rows containing non-finite values (stat_boxplot).

HEAP-CLIP peak intensity

This analysis will use the LFC of CLIP peak heights between HVAK and HVA samples.

CDF plots

clip_DGE <- as.data.frame(cbind(peaks.mir$name, peaks.mir$hfk.hf.log2FC))
colnames(clip_DGE)[1] <- "gene"
colnames(clip_DGE)[2] <- "log2FC"
clip_DGE$log2FC <- as.numeric(as.character(clip_DGE$log2FC))
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))

for (i in 1:10) {
  plotCDF.ggplot(clip_DGE,
               list(clip_DGE$gene, mirs.peaks[[i]]$name),
               c("All peaks", mirna[i]),
               col = c("grey15", cols[i]),
               linetype = c(1, 1),
               title = "HEAP-CLIP_LFC(K-Ras_G12D/WT)",
               xlim = c(-8,8)
               )
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties

Box plots

# get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
  gene.list <- c(gene.list, list(mirs.peaks[[i]]$name))
}

plotBoxplot.ggplot(clip_DGE,
               c(list(clip_DGE$gene), gene.list),
               c("All genes", mirna[1:10])
               )

Correlate miRNA abundance with associated peak number

# filter for miRNA with mean counts  > 200
mirna.family.DGE <- mirna.family.DGE[mirna.family.DGE$baseMean>200,]
mirna.family.DGE$peak.number <- NA
for (i in 1:length(mirna.family.DGE$miR.family)) {
  peak.number <- length(mirs.peaks[[i]])
  index <- grep(names(mirs.peaks)[i], mirna.family.DGE$miR.family)
  mirna.family.DGE$peak.number[index] <- peak.number
}

p1 <- ggplot(mirna.family.DGE, aes(x = log2(baseMean), y = peak.number, label = miR.family, size = peak.number)) +
  geom_point(colour = "#EC469A", alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
  xlab("Log2(Ago2-bound miRNA abundance)") +
  ylab("Number of HEAP-CLIP peaks mapped") +
  theme_bw() +
      theme(panel.border = element_blank(),
      panel.background = element_blank(),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      axis.title.x = element_text(size=12, margin = margin(t = 10)),
      axis.title.y = element_text(size=12, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_line(size = 0.5),
      axis.line.x = element_line(size = 0.5),
      axis.ticks.x = element_line(size = 0),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold")) + 
  xlim(5,18)
  
p1 <- ggplotly(p1)
p1
# calculate the correlation between these two factors
cor.test(log10(mirna.family.DGE$baseMean), mirna.family.DGE$peak.number)
## 
##  Pearson's product-moment correlation
## 
## data:  log10(mirna.family.DGE$baseMean) and mirna.family.DGE$peak.number
## t = 5.7554, df = 83, p-value = 1.409e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3621907 0.6708509
## sample estimates:
##       cor 
## 0.5340877

SessionInfo

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] plotly_4.9.3                forcats_0.5.1              
##  [3] stringr_1.4.0               dplyr_1.0.5                
##  [5] purrr_0.3.4                 tidyr_1.1.3                
##  [7] tibble_3.1.0                tidyverse_1.3.0            
##  [9] org.Mm.eg.db_3.11.4         EnsDb.Mmusculus.v79_2.99.0 
## [11] ensembldb_2.12.1            AnnotationFilter_1.12.0    
## [13] GenomicFeatures_1.40.1      AnnotationDbi_1.50.3       
## [15] readr_1.4.0                 RColorBrewer_1.1-2         
## [17] biomaRt_2.44.4              ggplot2_3.3.3              
## [19] VennDiagram_1.6.20          futile.logger_1.4.3        
## [21] BSgenome_1.56.0             Biostrings_2.56.0          
## [23] XVector_0.28.0              rtracklayer_1.48.0         
## [25] gdata_2.18.0                data.table_1.14.0          
## [27] DESeq2_1.28.1               SummarizedExperiment_1.18.2
## [29] DelayedArray_0.14.1         matrixStats_0.58.0         
## [31] Biobase_2.48.0              GenomicRanges_1.40.0       
## [33] GenomeInfoDb_1.24.2         IRanges_2.22.2             
## [35] S4Vectors_0.26.1            BiocGenerics_0.34.0        
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0         ellipsis_0.3.1           fs_1.5.0                
##  [4] rstudioapi_0.13          farver_2.1.0             bit64_4.0.5             
##  [7] lubridate_1.7.10         fansi_0.4.2              xml2_1.3.2              
## [10] splines_4.0.3            cachem_1.0.4             geneplotter_1.66.0      
## [13] knitr_1.31               jsonlite_1.7.2           Rsamtools_2.4.0         
## [16] broom_0.7.5              annotate_1.66.0          dbplyr_2.1.0            
## [19] compiler_4.0.3           httr_1.4.2               backports_1.2.1         
## [22] assertthat_0.2.1         Matrix_1.3-2             fastmap_1.1.0           
## [25] lazyeval_0.2.2           cli_2.3.1                formatR_1.8             
## [28] htmltools_0.5.1.1        prettyunits_1.1.1        tools_4.0.3             
## [31] gtable_0.3.0             glue_1.4.2               GenomeInfoDbData_1.2.3  
## [34] rappdirs_0.3.3           Rcpp_1.0.6               cellranger_1.1.0        
## [37] jquerylib_0.1.3          vctrs_0.3.7              crosstalk_1.1.1         
## [40] xfun_0.22                rvest_1.0.0              lifecycle_1.0.0         
## [43] gtools_3.8.2             XML_3.99-0.6             zlibbioc_1.34.0         
## [46] scales_1.1.1             hms_1.0.0                ProtGenerics_1.20.0     
## [49] lambda.r_1.2.4           yaml_2.2.1               curl_4.3                
## [52] memoise_2.0.0            sass_0.3.1               stringi_1.5.3           
## [55] RSQLite_2.2.5            highr_0.8                genefilter_1.70.0       
## [58] BiocParallel_1.22.0      rlang_0.4.10             pkgconfig_2.0.3         
## [61] bitops_1.0-6             evaluate_0.14            lattice_0.20-41         
## [64] labeling_0.4.2           htmlwidgets_1.5.3        GenomicAlignments_1.24.0
## [67] bit_4.0.4                tidyselect_1.1.0         magrittr_2.0.1          
## [70] R6_2.5.0                 generics_0.1.0           DBI_1.1.1               
## [73] haven_2.3.1              pillar_1.5.1             withr_2.4.1             
## [76] survival_3.2-10          RCurl_1.98-1.3           modelr_0.1.8            
## [79] crayon_1.4.1             futile.options_1.0.1     utf8_1.2.1              
## [82] BiocFileCache_1.12.1     rmarkdown_2.7            progress_1.2.2          
## [85] readxl_1.3.1             locfit_1.5-9.4           blob_1.2.1              
## [88] reprex_1.0.0             digest_0.6.27            xtable_1.8-4            
## [91] openssl_1.4.3            munsell_0.5.0            viridisLite_0.3.0       
## [94] bslib_0.2.4              askpass_1.1